L1: Embedding Models¶
⏳ Note (Kernel Starting): This notebook takes about 30 seconds to be ready to use. You may start and watch the video while you wait.
In [1]:
# Suppress all Python warnings so third-party deprecation notices
# don't clutter the notebook output.
import warnings
warnings.filterwarnings('ignore')
In [2]:
from sentence_transformers import SentenceTransformer
# Load a small pretrained embedding model; weights are downloaded on the
# first run. Per the repr below it outputs 384-dim vectors.
model = SentenceTransformer("all-MiniLM-L6-v2")
# Bare last expression: rich-display the module stack
# (Transformer -> Pooling -> Normalize).
model
modules.json: 0%| | 0.00/349 [00:00<?, ?B/s]
config_sentence_transformers.json: 0%| | 0.00/116 [00:00<?, ?B/s]
README.md: 0%| | 0.00/10.7k [00:00<?, ?B/s]
sentence_bert_config.json: 0%| | 0.00/53.0 [00:00<?, ?B/s]
config.json: 0%| | 0.00/612 [00:00<?, ?B/s]
model.safetensors: 0%| | 0.00/90.9M [00:00<?, ?B/s]
tokenizer_config.json: 0%| | 0.00/350 [00:00<?, ?B/s]
vocab.txt: 0%| | 0.00/232k [00:00<?, ?B/s]
tokenizer.json: 0%| | 0.00/466k [00:00<?, ?B/s]
special_tokens_map.json: 0%| | 0.00/112 [00:00<?, ?B/s]
1_Pooling/config.json: 0%| | 0.00/190 [00:00<?, ?B/s]
Out[2]:
SentenceTransformer(
(0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel
(1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
(2): Normalize()
)
💻 Access requirements.txt and helper.py files: 1) click on the "File" option on the top menu of the notebook and then 2) click on "Open". For more help, please see the "Appendix - Tips and Help" Lesson.
In [3]:
# Tokenize a sample sentence. The result (see Out[3]) is a dict of
# tensors: input_ids, token_type_ids and attention_mask.
tokenized_data = model.tokenize(["walker walked a long walk"])
tokenized_data
Out[3]:
{'input_ids': tensor([[ 101, 5232, 2939, 1037, 2146, 3328, 102]]),
'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]),
'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}
In [4]:
# Map the numeric token ids back to their string form for readability --
# note the [CLS]/[SEP] markers added around the words (see Out[4]).
model.tokenizer.convert_ids_to_tokens(tokenized_data["input_ids"][0])
Out[4]:
['[CLS]', 'walker', 'walked', 'a', 'long', 'walk', '[SEP]']
In [5]:
# The SentenceTransformer is a stack of modules (Transformer -> Pooling
# -> Normalize, per Out[2]). Tokens are the input of the first module
# only, so we can ignore the rest here.
# NOTE(review): `_first_module()` is a private API -- it may change
# between sentence-transformers versions.
first_module = model._first_module()
# Display the underlying HuggingFace BertModel wrapped by the module.
first_module.auto_model
Out[5]:
BertModel(
(embeddings): BertEmbeddings(
(word_embeddings): Embedding(30522, 384, padding_idx=0)
(position_embeddings): Embedding(512, 384)
(token_type_embeddings): Embedding(2, 384)
(LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(encoder): BertEncoder(
(layer): ModuleList(
(0-5): 6 x BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=384, out_features=384, bias=True)
(key): Linear(in_features=384, out_features=384, bias=True)
(value): Linear(in_features=384, out_features=384, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=384, out_features=384, bias=True)
(LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=384, out_features=1536, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=1536, out_features=384, bias=True)
(LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
)
(pooler): BertPooler(
(dense): Linear(in_features=384, out_features=384, bias=True)
(activation): Tanh()
)
)
Input token embeddings¶
In [6]:
# The embedding layer of the underlying BERT model: word, position and
# token-type embeddings plus LayerNorm/dropout (see Out[6]).
embeddings = first_module.auto_model.embeddings
embeddings
Out[6]:
BertEmbeddings( (word_embeddings): Embedding(30522, 384, padding_idx=0) (position_embeddings): Embedding(512, 384) (token_type_embeddings): Embedding(2, 384) (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) )
In [7]:
import torch
import plotly.express as px

# Run the embedding lookup on whatever device the embedding weights
# already live on. The previous `torch.has_mps` check is deprecated
# (removed in recent PyTorch), its comment promised CUDA handling that
# was never implemented, and hard-coding MPS could move the inputs to a
# different device than the model, causing a runtime device mismatch.
device = embeddings.word_embeddings.weight.device

first_sentence = "vector search optimization"
second_sentence = "we learn to apply vector search optimization"

with torch.no_grad():
    # Tokenize both texts
    first_tokens = model.tokenize([first_sentence])
    second_tokens = model.tokenize([second_sentence])

    # Look up the static (context-independent) input embeddings for
    # each token id.
    first_embeddings = embeddings.word_embeddings(
        first_tokens["input_ids"].to(device)
    )
    second_embeddings = embeddings.word_embeddings(
        second_tokens["input_ids"].to(device)
    )

# Shapes: (batch=1, num_tokens incl. [CLS]/[SEP], embedding_dim=384)
first_embeddings.shape, second_embeddings.shape
Out[7]:
(torch.Size([1, 5, 384]), torch.Size([1, 9, 384]))
In [8]:
from sentence_transformers import util

# Pairwise cosine similarity between every token of the first sentence
# (rows) and every token of the second sentence (columns).
similarity_matrix = util.cos_sim(
    first_embeddings.squeeze(),
    second_embeddings.squeeze()
)
# Move the tensor to the CPU and convert to a NumPy array for plotting.
distances = similarity_matrix.cpu().numpy()

# Label the axes with the actual token strings rather than ids.
x_labels = model.tokenizer.convert_ids_to_tokens(second_tokens["input_ids"][0])
y_labels = model.tokenizer.convert_ids_to_tokens(first_tokens["input_ids"][0])

px.imshow(distances, x=x_labels, y=y_labels, text_auto=True)
Visualizing the input embeddings¶
In [9]:
# Pull the raw input-embedding matrix out of the model as a NumPy array:
# one row per vocabulary entry (30522 x 384, see Out[9]).
word_embedding_layer = first_module.auto_model.embeddings.word_embeddings
token_embeddings = word_embedding_layer.weight.detach().cpu().numpy()
token_embeddings.shape
Out[9]:
(30522, 384)
In [10]:
import random

# Seed the sampler so the 100 example tokens displayed below are the
# same on every re-run (the cell was previously non-reproducible).
random.seed(42)

# token -> id mapping of the model's WordPiece vocabulary.
vocabulary = first_module.tokenizer.get_vocab()

# Sort tokens by their id so that list index == token id; this ordering
# is relied on by the coloring/plotting cells further down.
sorted_vocabulary = sorted(
    vocabulary.items(),
    key=lambda x: x[1],  # uses the value of the dictionary entry (the id)
)
sorted_tokens = [token for token, _ in sorted_vocabulary]

# Peek at 100 random vocabulary entries (sampled with replacement).
random.choices(sorted_tokens, k=100)
Out[10]:
['shaky', '[unused933]', 'bautista', 'robbers', 'patting', 'mbe', 'darius', 'code', 'peasant', 'la', 'خ', 'ridiculous', 'ridges', '₉', 'thirteenth', 'insects', 'caucasus', 'michele', 'syria', 'bark', 'nic', 'chrome', '1968', 'fey', 'shootout', '[unused799]', '##子', '##nna', 'wilderness', 'stalled', 'romney', 'liberated', 'feat', '##場', 'gwen', 'whereupon', '##wny', 'liar', 'morales', 'cinemas', '700', 'italiana', 'jozef', 'cretaceous', 'shortlisted', 'waldo', 'abundance', 'is', 'functional', 'inspector', 'he', 'campaign', '?', 'beneath', '##み', 'cal', 'axis', 'assert', 'lancashire', 'dorothy', 'prevalent', 'superhuman', '##sz', 'franciscan', 'ultimatum', 'suspicion', 'beak', '博', 'recycling', 'drinks', 'galician', 'appeal', 'narrows', 'hamish', 'soluble', 'deter', 'ท', '313', 'alvaro', 'handbook', 'compiled', 'misunderstanding', 'attitudes', 'printers', 'empty', 'tiles', 'turkic', 'yoon', 'tools', 'addition', 'octagonal', 'littered', 'darmstadt', '##rito', 'lindsey', 'avoid', 'vapor', '##born', 'prosperous', 'collapse']
In [11]:
from sklearn.manifold import TSNE
# Project all 30522 token vectors from 384-D down to 2-D for plotting,
# using cosine distance to match how the embeddings are compared.
# random_state is fixed so the layout is reproducible.
# NOTE: fitting t-SNE on the full vocabulary is expensive -- expect this
# cell to take a while.
tsne = TSNE(n_components=2, metric="cosine", random_state=42)
tsne_embeddings_2d = tsne.fit_transform(token_embeddings)
tsne_embeddings_2d.shape
Out[11]:
(30522, 2)
In [12]:
token_colors = []
for token in sorted_tokens:
if token[0] == "[" and token[-1] == "]":
token_colors.append("red")
elif token.startswith("##"):
token_colors.append("blue")
else:
token_colors.append("green")
In [13]:
import plotly.graph_objs as go

# One WebGL scatter point per vocabulary token; hover text shows the
# token string, color encodes the token kind (special / subword / word).
token_scatter = go.Scattergl(
    x=tsne_embeddings_2d[:, 0],
    y=tsne_embeddings_2d[:, 1],
    text=sorted_tokens,
    marker=dict(color=token_colors, size=3),
    mode="markers",
    name="Token embeddings",
)

# Tall, narrow canvas with no side margins.
figure_layout = dict(
    width=600,
    height=900,
    margin=dict(l=0, r=0),
)

fig = go.FigureWidget(data=[token_scatter], layout=figure_layout)
fig.show()
Output token embeddings¶
In [14]:
# Encode a sentence into a single pooled sentence embedding -- one
# 384-dim vector regardless of sentence length (see Out[14]).
output_embedding = model.encode(["walker walked a long walk"])
output_embedding.shape
Out[14]:
(1, 384)
In [15]:
# Request per-token output embeddings instead of the pooled sentence
# vector: one 384-dim vector for each of the 7 tokens, including
# [CLS]/[SEP] (see Out[15]).
output_token_embeddings = model.encode(
    ["walker walked a long walk"],
    output_value="token_embeddings"
)
output_token_embeddings[0].shape
Out[15]:
torch.Size([7, 384])
In [16]:
first_sentence = "vector search optimization"
second_sentence = "we learn to apply vector search optimization"
with torch.no_grad():
first_tokens = model.tokenize([first_sentence])
second_tokens = model.tokenize([second_sentence])
first_embeddings = model.encode(
[first_sentence],
output_value="token_embeddings"
)
second_embeddings = model.encode(
[second_sentence],
output_value="token_embeddings"
)
distances = util.cos_sim(
first_embeddings[0],
second_embeddings[0]
)
In [17]:
# Heatmap of the token-to-token cosine similarities computed above,
# with the token strings of each sentence as axis labels.
px.imshow(
    distances.cpu().numpy(), # Move the tensor to CPU and convert to a NumPy array
    x=model.tokenizer.convert_ids_to_tokens(
        second_tokens["input_ids"][0]
    ),
    y=model.tokenizer.convert_ids_to_tokens(
        first_tokens["input_ids"][0]
    ),
    text_auto=True,
)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: